###############################################                                     
# Cause of Effect? Turnout in Hispanic Majority-Minority Districts
#  - John A. Henderson, Jasjeet S. Sekhon, and Rocio Titiunik
#  - Forthcoming in Political Analysis
#  - Replication file for <Table 1>
#  - April 14, 2016
###############################################

###########################################################################
#
## GENERATES TABLE 1: PLACEBO RESULTS FOR 2000 for Turnout Outcome
#
###########################################################################

# Session setup: wide console output and an emptied workspace.
options(width = 150)
rm(list = ls())

# Root directory of the replication archive; the data paths further down are
# built from it.  headers.R is a project script whose contents are not shown
# in this file.
path <- "/local/"
source(paste0(path, "replicationPA/funs/headers.R"))

# Fixed seed for reproducibility.
set.seed(57459)
  
# Load the baseline (WH2data) and matched (Sdata) data sets.
#
# The two .Rdata files are expected to define the objects WH2data and Sdata
# that are used immediately after each load().  Earlier, now disabled,
# versions read the same data from CSV / Stata files:
#   WH2data <- read.csv(file = paste(path,"Baseline-afterMATCH.csv",sep=''), header = TRUE, stringsAsFactors = FALSE)
#   WH2data <- read.dta(file = paste(path,"replicationPA/data/Baseline-afterMATCH-recreated.dta",sep=''))
#   Sdata   <- read.dta(file = paste(path,"replicationPA/data/Matched-recreated.dta",sep=''))

# --- Baseline sample ---
load(paste0(path, "replicationPA/data/Baseline-afterMATCH-recreated.Rdata"))
dim(WH2data)

# Keep only rows flagged dfound_pair_newdata == 1 (which() also drops NA flags).
WH2data <- WH2data[which(WH2data$dfound_pair_newdata == 1), ]
dim(WH2data)

# --- Matched sample ---
load(paste0(path, "replicationPA/data/Matched-recreated.Rdata"))
dim(Sdata)

Sdata <- Sdata[which(Sdata$dfound_pair_newdata == 1), ]
dim(Sdata)

# Derived covariates and non-Hispanic counts, built identically for the
# baseline (WH2data) and matched (Sdata) samples.
#
# The outcome sections of this script treat 999999 as a missing-value code:
# any ratio whose numerator or denominator equals 999999 is set to NA.  A
# plain subtraction destroys that code (e.g. 999999 - 50 = 999949), so the
# `== 999999` checks applied to the derived *_nhisp and nhvap columns could
# never fire and a meaningless ratio would enter the tests.  The helper below
# re-stamps the code on every difference that has a coded input.  When the
# inputs contain no 999999 values the derived columns are exactly what a
# plain subtraction would give.

MISSING_CODE <- 999999

# total - part, returning `missing_code` wherever either input carries it.
diff_keep_missing <- function(total, part, missing_code = MISSING_CODE) {
  out <- total - part
  out[which(total == missing_code | part == missing_code)] <- missing_code
  out
}

# Add all derived columns to data frame `d` and return the augmented frame.
# Columns are appended in a fixed order: hvap, nhvap, the collapsed income
# and foreign-born shares, the 1998 registration columns, then turnout and
# registration for 2000, 2002, 2004 and 2006.
add_derived_vars <- function(d) {
  # Hispanic and non-Hispanic voting-age population.
  d$hvap  <- d$pop_hispanic18
  d$nhvap <- diff_keep_missing(d$vap, d$hvap)

  # Collapsed 1999 household-income brackets and total foreign-born share.
  d$phh_income99_39less  <- d$phh_income99_0to19    + d$phh_income99_20to39
  d$phh_income99_40to74  <- d$phh_income99_40to59   + d$phh_income99_60to74
  d$phh_income99_100plus <- d$phh_income99_100to199 + d$phh_income99_200
  d$ppop_foreign         <- d$ppop_foreign_naturcit + d$ppop_foreign_nocit

  # 1998: non-Hispanic registration and its share of total registration.
  # The share inherits the missing code from its numerator.
  d$y98_reg_nhisp  <- diff_keep_missing(d$y98_reg_tot, d$y98_reg_hisp)
  d$y98_preg_nhisp <- d$y98_reg_nhisp / d$y98_reg_tot
  d$y98_preg_nhisp[which(d$y98_reg_nhisp == MISSING_CODE)] <- MISSING_CODE

  # 2000-2006: non-Hispanic turnout and registration = total - Hispanic.
  for (yr in c("y00", "y02", "y04", "y06")) {
    for (kind in c("turn", "reg")) {
      total <- d[[paste0(yr, "_", kind, "_tot")]]
      hisp  <- d[[paste0(yr, "_", kind, "_hisp")]]
      d[[paste0(yr, "_", kind, "_nhisp")]] <- diff_keep_missing(total, hisp)
    }
  }

  d
}

WH2data <- add_derived_vars(WH2data)
Sdata   <- add_derived_vars(Sdata)

# Standardise the treatment-indicator column name: a column called "tr" is
# renamed to "Tr" in both data sets (nothing happens if no such column exists).
names(WH2data)[names(WH2data) == "tr"] <- "Tr"
names(Sdata)[names(Sdata) == "tr"] <- "Tr"


# (Disabled) construction of non-Hispanic outcomes for WH2data.
# NOTE: the original heading said "2000", but the disabled code builds the
# 2004 (y04) columns; those columns are already created further up.

#WH2data$y04_reg_nhisp=WH2data$y04_reg_tot-WH2data$y04_reg_hisp
#WH2data$y04_turn_nhisp=WH2data$y04_turn_tot-WH2data$y04_turn_hisp
  
# (Disabled) same construction for Sdata.

#Sdata$y04_reg_nhisp=Sdata$y04_reg_tot-Sdata$y04_reg_hisp
#Sdata$y04_turn_nhisp=Sdata$y04_turn_tot-Sdata$y04_turn_hisp


# Covariate groups for conditioning / balance, all taken from the matched
# sample (Sdata).  Columns are passed as `Sdata$<name>` so data.frame()
# auto-names them "Sdata.<name>".

# Most important variables first.
imp.vars <- data.frame(
  Sdata$vap,
  Sdata$ppop_black18,
  Sdata$ppop_hispanic18,
  Sdata$phh_income99_39less,
  Sdata$phh_income99_40to74,
  Sdata$phh_income99_100plus,
  Sdata$ppop_25_hsless,
  Sdata$ppop_foreign,
  Sdata$ppop_foreign_naturcit,
  Sdata$ppop_foreign_nocit
)

# Registration variables: 1998 only.
reg.vars <- data.frame(
  Sdata$y98_preg_tot,
  # No turnout information exists for 1998, so Hispanic registration is the
  # closest available proxy for a lagged outcome.
  Sdata$y98_preg_hisp,
  Sdata$y98_preg_dem,
  Sdata$y98_preg_rep
)
dim(reg.vars)
cat("Final dimension of reg.vars: ", dim(reg.vars), "\n")

# Vote variables: statewide and local offices.
vote.vars <- data.frame(
  Sdata$y98_pvote_ussdem,
  Sdata$y98_pvote_govdem,
  Sdata$y98_pvote_cngdem,
  Sdata$y98_pvote_assdem,
  Sdata$y98_pvote_atgdem
)

dim(vote.vars)
cat("Final dimension of vote.vars: ", dim(vote.vars), "\n")

# Population variables.
pop.vars <- data.frame(
  Sdata$ppop_fem,
  Sdata$ppop_25to44,
  Sdata$ppop_45to59,
  Sdata$ppop_70older
)
cat("Final dimension of pop.vars: ", dim(pop.vars), "\n")


# Xall stacks every group; X (and its copy B) holds only the most important
# variables.  Sdata$DisTri_1991 is excluded on purpose, since one unique
# triplet is kept in every file.
Xall <- data.frame(imp.vars, reg.vars, vote.vars, pop.vars)
B <- X <- data.frame(imp.vars)
dim(X)

    
# Row positions of treated and control units in each sample.
#
# TRUE / FALSE are used instead of T / F: T and F are ordinary variables that
# any previously sourced script could have reassigned, whereas TRUE and FALSE
# are reserved words.  The index lists are built explicitly with list()
# rather than by `$`-assigning into an empty c().
#
# `Tr` is overwritten twice and ends up holding the Sdata indicator.

# Baseline sample (WH2data)
Tr <- WH2data$Tr
m1 <- list(
  index.treated = which(Tr == TRUE),
  index.control = which(Tr == FALSE)
)
print(table(Tr))

# Matched sample (Sdata)
Tr <- Sdata$Tr
m2 <- list(
  index.treated = which(Tr == TRUE),
  index.control = which(Tr == FALSE)
)
print(table(Tr))

        
######################################################
### Results matrix
######################################################

# Results container: one row per outcome / sample combination and one column
# per reported statistic.  "WH2" rows are filled from the baseline sample and
# "AM" rows from the matched sample; every cell starts out as NA.
PLACrownm <- c(
  "Hispanic Turnout 2000 WH2",
  "Hispanic Registration 2000 WH2",
  "NonHispanic Turnout 2000 WH2",
  "NonHispanic Registration 2000 WH2",

  "Hispanic Turnout 2000 AM",
  "Hispanic Registration 2000 AM",
  "NonHispanic Turnout 2000 AM",
  "NonHispanic Registration 2000 AM"
)
PLACcolnm <- c("Mean Tr", "Mean Co", "Diff means p-val", "KS test p-val")

RESPLAC <- matrix(
  NA,
  nrow = length(PLACrownm),
  ncol = length(PLACcolnm),
  dimnames = list(PLACrownm, PLACcolnm)
)


######################################################
### Placebo 2000 results start here: 
######################################################

################################################################################
### Placebo results (i): Hispanic Registration in 2000 (as share of Hispanic VAP)
################################################################################

# ---- Baseline sample (WH2data) ----
# Outcome: Hispanic registration in 2000 divided by Hispanic VAP.

Y <- WH2data$y00_reg_hisp / WH2data$hvap
# Set to NA any ratio built from the 999999 code or a zero denominator, and
# any share above 1.
drop_wh2 <- WH2data$y00_reg_hisp == 999999 | WH2data$hvap == 0 | WH2data$hvap == 999999
Y[which(drop_wh2)] <- NA
Y[which(Y > 1)] <- NA
Tr <- WH2data$Tr == 1

# Treated rows first, then controls.
order_wh2 <- c(m1$index.treated, m1$index.control)
TrWH2 <- Tr[order_wh2]
YWH2  <- Y[order_wh2]

# ---- Matched sample (Sdata) ----

Y <- Sdata$y00_reg_hisp / Sdata$hvap
drop_am <- Sdata$hvap == 0 | Sdata$hvap == 999999 | Sdata$y00_reg_hisp == 999999
Y[which(drop_am)] <- NA
Y[which(Y > 1)] <- NA

Y1 <- Y[m2$index.treated]
Y0 <- Y[m2$index.control]

# Stack treated then control outcomes with a matching logical indicator.
Ym  <- c(Y1, Y0)
Trm <- rep(c(TRUE, FALSE), times = c(length(Y1), length(Y0)))

# Columns of RESPLAC filled by each test pair, in the order of the values
# assigned below.
stat_cols <- c("Mean Tr", "Mean Co", "Diff means p-val", "KS test p-val")

########################
## WH2data: Most naive
#######################
# ks.boot() is not defined in this file (presumably made available by the
# sourced headers.R).
cat("Top of results for WH2data \n")
t <- t.test(YWH2[TrWH2], YWH2[!TrWH2])
cat("T-test:\n")
print(t)
ks <- ks.boot(YWH2[TrWH2], YWH2[!TrWH2])
cat("KS Boot:\n")
print(ks)

RESPLAC["Hispanic Registration 2000 WH2", stat_cols] <-
  c(t$estimate[1], t$estimate[2], t$p.value, ks$ks.boot.pvalue)

########################
## Final matching: matching across triplets on the stacked data
#######################

cat("Top of results AFTER matching \n")
t <- t.test(Ym[Trm], Ym[!Trm])
cat("T-test:\n")
print(t)
ks <- ks.boot(Ym[Trm], Ym[!Trm])
cat("KS Boot:\n")
print(ks)

RESPLAC["Hispanic Registration 2000 AM", stat_cols] <-
  c(t$estimate[1], t$estimate[2], t$p.value, ks$ks.boot.pvalue)


################################################################################
### Placebo results (ii): Hispanic Turnout in 2000 (as share of Hispanic Registration in 2000) 
################################################################################

# ---- Baseline sample (WH2data) ----
# Outcome: Hispanic turnout in 2000 divided by Hispanic registration in 2000.

Y <- WH2data$y00_turn_hisp / WH2data$y00_reg_hisp
# Set to NA any ratio built from the 999999 code or a zero denominator.
drop_wh2 <- WH2data$y00_turn_hisp == 999999 | WH2data$y00_reg_hisp == 0 | WH2data$y00_reg_hisp == 999999
Y[which(drop_wh2)] <- NA
Tr <- WH2data$Tr == 1

# Treated rows first, then controls.
order_wh2 <- c(m1$index.treated, m1$index.control)
TrWH2 <- Tr[order_wh2]
YWH2  <- Y[order_wh2]

# ---- Matched sample (Sdata) ----

Y <- Sdata$y00_turn_hisp / Sdata$y00_reg_hisp
drop_am <- Sdata$y00_turn_hisp == 999999 | Sdata$y00_reg_hisp == 0 | Sdata$y00_reg_hisp == 999999
Y[which(drop_am)] <- NA

Y1 <- Y[m2$index.treated]
Y0 <- Y[m2$index.control]

# Stack treated then control outcomes with a matching logical indicator.
Ym  <- c(Y1, Y0)
Trm <- rep(c(TRUE, FALSE), times = c(length(Y1), length(Y0)))

# Columns of RESPLAC filled by each test pair, in the order of the values
# assigned below.
stat_cols <- c("Mean Tr", "Mean Co", "Diff means p-val", "KS test p-val")

########################
## WH2data: Most naive
#######################
# ks.boot() is not defined in this file (presumably made available by the
# sourced headers.R).
cat("Top of results for WH2data \n")
t <- t.test(YWH2[TrWH2], YWH2[!TrWH2])
cat("T-test:\n")
print(t)
ks <- ks.boot(YWH2[TrWH2], YWH2[!TrWH2])
cat("KS Boot:\n")
print(ks)

RESPLAC["Hispanic Turnout 2000 WH2", stat_cols] <-
  c(t$estimate[1], t$estimate[2], t$p.value, ks$ks.boot.pvalue)

########################
## Final matching: matching across triplets on the stacked data
#######################

cat("Top of results AFTER matching \n")
t <- t.test(Ym[Trm], Ym[!Trm])
cat("T-test:\n")
print(t)
ks <- ks.boot(Ym[Trm], Ym[!Trm])
cat("KS Boot:\n")
print(ks)

RESPLAC["Hispanic Turnout 2000 AM", stat_cols] <-
  c(t$estimate[1], t$estimate[2], t$p.value, ks$ks.boot.pvalue)

#cat("Hodges-Lehmann\n")
#cat("Using option exact=FALSE\n")
#a <- wilcox.exact(Ym[Trm], Ym[!Trm],exact=FALSE, paired=TRUE, conf.int=TRUE, conf.level=0.95)
#cat("H-L estimate:",a$estimate,"\n")
#cat("H-L CI:",a$conf.int,"\n")
#cat("H-L p.value:",a$p.value,"\n")

################################################################################
### Placebo results (iii): NonHispanic Turnout in 2000 (as share of NonHispanic Registration in 2000) 
################################################################################

# ---- Baseline sample (WH2data) ----
# Outcome: non-Hispanic turnout in 2000 divided by non-Hispanic registration
# in 2000.

Y <- WH2data$y00_turn_nhisp / WH2data$y00_reg_nhisp
# Set to NA any ratio built from the 999999 code or a zero denominator.
drop_wh2 <- WH2data$y00_turn_nhisp == 999999 | WH2data$y00_reg_nhisp == 0 | WH2data$y00_reg_nhisp == 999999
Y[which(drop_wh2)] <- NA
Tr <- WH2data$Tr == 1

# Treated rows first, then controls.
order_wh2 <- c(m1$index.treated, m1$index.control)
TrWH2 <- Tr[order_wh2]
YWH2  <- Y[order_wh2]

# ---- Matched sample (Sdata) ----

Y <- Sdata$y00_turn_nhisp / Sdata$y00_reg_nhisp
drop_am <- Sdata$y00_reg_nhisp == 0 | Sdata$y00_reg_nhisp == 999999 | Sdata$y00_turn_nhisp == 999999
Y[which(drop_am)] <- NA

Y1 <- Y[m2$index.treated]
Y0 <- Y[m2$index.control]

# Stack treated then control outcomes with a matching logical indicator.
Ym  <- c(Y1, Y0)
Trm <- rep(c(TRUE, FALSE), times = c(length(Y1), length(Y0)))

# Columns of RESPLAC filled by each test pair, in the order of the values
# assigned below.
stat_cols <- c("Mean Tr", "Mean Co", "Diff means p-val", "KS test p-val")

########################
## WH2data: Most naive
#######################
# ks.boot() is not defined in this file (presumably made available by the
# sourced headers.R).
cat("Top of results for WH2data \n")
t <- t.test(YWH2[TrWH2], YWH2[!TrWH2])
cat("T-test:\n")
print(t)
ks <- ks.boot(YWH2[TrWH2], YWH2[!TrWH2])
cat("KS Boot:\n")
print(ks)

RESPLAC["NonHispanic Turnout 2000 WH2", stat_cols] <-
  c(t$estimate[1], t$estimate[2], t$p.value, ks$ks.boot.pvalue)

########################
## Final matching: matching across triplets on the stacked data
#######################

cat("Top of results AFTER matching \n")
t <- t.test(Ym[Trm], Ym[!Trm])
cat("T-test:\n")
print(t)
ks <- ks.boot(Ym[Trm], Ym[!Trm])
cat("KS Boot:\n")
print(ks)

RESPLAC["NonHispanic Turnout 2000 AM", stat_cols] <-
  c(t$estimate[1], t$estimate[2], t$p.value, ks$ks.boot.pvalue)



################################################################################
### Placebo results (iv): NonHispanic Registration in 2000 (as share of NonHispanic VAP)
################################################################################

# ---- Baseline sample (WH2data) ----
# Outcome: non-Hispanic registration in 2000 divided by non-Hispanic VAP.

Y <- WH2data$y00_reg_nhisp / WH2data$nhvap
# Set to NA any ratio built from the 999999 code or a zero denominator, and
# any share above 1.
drop_wh2 <- WH2data$y00_reg_nhisp == 999999 | WH2data$nhvap == 0 | WH2data$nhvap == 999999
Y[which(drop_wh2)] <- NA
Y[which(Y > 1)] <- NA
Tr <- WH2data$Tr == 1

# Treated rows first, then controls.
order_wh2 <- c(m1$index.treated, m1$index.control)
TrWH2 <- Tr[order_wh2]
YWH2  <- Y[order_wh2]

# ---- Matched sample (Sdata) ----

Y <- Sdata$y00_reg_nhisp / Sdata$nhvap
drop_am <- Sdata$nhvap == 0 | Sdata$nhvap == 999999 | Sdata$y00_reg_nhisp == 999999
Y[which(drop_am)] <- NA
Y[which(Y > 1)] <- NA

Y1 <- Y[m2$index.treated]
Y0 <- Y[m2$index.control]

# Stack treated then control outcomes with a matching logical indicator.
Ym  <- c(Y1, Y0)
Trm <- rep(c(TRUE, FALSE), times = c(length(Y1), length(Y0)))

# Columns of RESPLAC filled by each test pair, in the order of the values
# assigned below.
stat_cols <- c("Mean Tr", "Mean Co", "Diff means p-val", "KS test p-val")

########################
## WH2data: Most naive
#######################
# ks.boot() is not defined in this file (presumably made available by the
# sourced headers.R).
cat("Top of results for WH2data \n")
t <- t.test(YWH2[TrWH2], YWH2[!TrWH2])
cat("T-test:\n")
print(t)
ks <- ks.boot(YWH2[TrWH2], YWH2[!TrWH2])
cat("KS Boot:\n")
print(ks)

RESPLAC["NonHispanic Registration 2000 WH2", stat_cols] <-
  c(t$estimate[1], t$estimate[2], t$p.value, ks$ks.boot.pvalue)

########################
## Final matching: matching across triplets on the stacked data
#######################

cat("Top of results AFTER matching \n")
t <- t.test(Ym[Trm], Ym[!Trm])
cat("T-test:\n")
print(t)
ks <- ks.boot(Ym[Trm], Ym[!Trm])
cat("KS Boot:\n")
print(ks)

RESPLAC["NonHispanic Registration 2000 AM", stat_cols] <-
  c(t$estimate[1], t$estimate[2], t$p.value, ks$ks.boot.pvalue)

#cat("Hodges-Lehmann\n")
#cat("Using option exact=FALSE\n")
#a <- wilcox.exact(Ym[Trm], Ym[!Trm],exact=FALSE, paired=TRUE, conf.int=TRUE, conf.level=0.95)
#cat("H-L estimate:",a$estimate,"\n")
#cat("H-L CI:",a$conf.int,"\n")
#cat("H-L p.value:",a$p.value,"\n")


# Print the full results matrix, then emit one LaTeX table per sample.
print(RESPLAC)

# END
#####
####################### LATEX TABLES ##################
library(xtable)
library(stringr)

# Rows are selected by name rather than by position (1:4 / 5:8), so the short
# labels assigned below cannot end up on the wrong outcome if the row order
# of RESPLAC is ever changed.
outcome_keys <- c(
  "Hispanic Turnout 2000",
  "Hispanic Registration 2000",
  "NonHispanic Turnout 2000",
  "NonHispanic Registration 2000"
)
outcome_labels <- c(
  'Hispanic Turnout',
  'Hispanic Registration',
  'Non Hispanic Turnout',
  'Non Hispanic Registration'
)

baseline <- RESPLAC[paste(outcome_keys, "WH2"), , drop = FALSE]
matched  <- RESPLAC[paste(outcome_keys, "AM"), , drop = FALSE]
rownames(baseline) <- rownames(matched) <- outcome_labels

# The xtable objects are printed explicitly: a bare xtable() call is rendered
# only by top-level auto-printing, which does not happen when this script is
# run through source() with its default arguments.
print('--------------------')
print('Baseline')
print('--------------------')
print(xtable(baseline, digits = c(1, 3, 3, 2, 2)))

print('--------------------')
print('Matched')
print('--------------------')
print(xtable(matched, digits = c(1, 3, 3, 2, 2)))
                                              
################## LATEX TABLES ##################